#!/usr/bin/env python3
import json, re
import numpy as np, pandas as pd

# Input table read by main() via pandas.read_csv.
IN_CSV  = "outputs/lensing_plateau.csv"
# Output path written by main(); any existing file at this path is overwritten
# with the weighted-regression result.
OUT_JSON= "outputs/size_regression.json"  # overwrite with weighted result

def rg_mid(label: str) -> float:
    """Return the midpoint of a textual range label such as ``"1.5-3.0"``.

    Em and en dashes are treated as an ASCII hyphen before parsing.  The
    pattern is anchored at the start of the label only, so trailing text
    after the second number is ignored.

    Args:
        label: Range label of the form ``"<low>-<high>"``.

    Returns:
        ``0.5 * (low + high)``, or NaN when *label* is not a string, does not
        start with a ``number-number`` pattern, or contains a token that is
        not a valid float (e.g. ``"1.2.3"`` or a lone ``"."``).
    """
    nan = float("nan")
    if not isinstance(label, str):
        return nan
    s = label.strip().replace("—", "-").replace("–", "-")
    m = re.match(r"\s*([0-9.]+)\s*-\s*([0-9.]+)\s*", s)
    if not m:
        return nan
    try:
        a, b = float(m.group(1)), float(m.group(2))
    except ValueError:
        # The character class [0-9.]+ also admits malformed tokens such as
        # "1.2.3" or "."; treat them like any other unparseable label rather
        # than letting the exception abort the caller.
        return nan
    return 0.5 * (a + b)

def wls_slope(x, y, w):
    """Return the slope of the weighted least-squares line of *y* on *x*.

    The slope is the weighted covariance of (x, y) divided by the weighted
    variance of x, both taken about the weighted means.

    Args:
        x: Abscissa values (array-like, converted to float).
        y: Ordinate values (array-like, converted to float).
        w: Per-point weights (array-like, converted to float).

    Returns:
        The slope as a Python float, or NaN when the weight sum is not a
        finite positive number or the weighted spread of *x* is not positive.
    """
    xs = np.asarray(x, float)
    ys = np.asarray(y, float)
    ws = np.asarray(w, float)

    total = ws.sum()
    if not (np.isfinite(total) and total > 0):
        return float("nan")

    # Deviations from the weighted means.
    dx = xs - np.sum(ws * xs) / total
    dy = ys - np.sum(ws * ys) / total

    sxy = np.sum(ws * dx * dy)
    sxx = np.sum(ws * dx ** 2)
    if sxx > 0:
        return float(sxy / sxx)
    # Degenerate case: all x coincide (or sxx is NaN), so no slope is defined.
    return float("nan")

def _nan_entry(n_stacks):
    """Build the result record for a mass bin that cannot be fitted."""
    nan = float("nan")
    return {"n_stacks": int(n_stacks), "slope_Atheta_vs_RG": nan,
            "CI_16": nan, "CI_84": nan, "weighted": True}


def _stack_weights(g, eps):
    """Return the per-stack regression weights for one Mstar_bin group."""
    # base weight from flatness (flatter => larger weight); eps keeps the
    # weight finite when rmse_flat is exactly zero.
    w = 1.0 / (g["rmse_flat"].to_numpy(float) ** 2 + eps)
    # optional boost: more lenses (capped gently).  A missing count gets the
    # neutral factor sqrt(1) = 1 so that a single NaN cannot poison the
    # weights (and hence the bootstrap probabilities) of the whole group.
    n_lenses = g["n_lenses"].fillna(1.0).to_numpy(float)
    w *= np.sqrt(np.clip(n_lenses, 1.0, 1e6))
    # optional boost: better R2, clipped to [0.05, 1.05].
    w *= np.clip(g["R2_flat"].to_numpy(float) + 0.05, 0.05, 1.05)
    return w


def _write_json(payload):
    """Write *payload* to OUT_JSON, closing the file handle on exit."""
    # NaN values are emitted as the bare token NaN (json.dump default), which
    # Python's json.load accepts.
    with open(OUT_JSON, "w") as fh:
        json.dump(payload, fh, indent=2)


def main():
    """Fit A_theta against the R_G bin midpoint for each Mstar_bin.

    Reads IN_CSV, keeps rows whose ``claimable`` column reads "true"
    (case-insensitive), and for every ``Mstar_bin`` group with at least three
    usable stacks computes a weighted least-squares slope plus a 16th/84th
    percentile interval from 4000 weighted bootstrap resamples (fixed seed
    42).  Groups with fewer than three stacks get NaN slope and interval.
    The per-bin results are written to OUT_JSON; an empty JSON object is
    written when no usable rows remain.
    """
    df = pd.read_csv(IN_CSV)
    ok = (df["claimable"].astype(str).str.lower() == "true")
    use = df.loc[ok, ["Mstar_bin", "R_G_bin", "A_theta", "rmse_flat",
                      "n_lenses", "R2_flat"]].copy()

    use["RG_mid"] = use["R_G_bin"].apply(rg_mid)
    for col in ("A_theta", "rmse_flat", "n_lenses", "R2_flat"):
        use[col] = pd.to_numeric(use[col], errors="coerce")

    # Map +/-inf to NaN *before* filling R2_flat, so an infinite R2 is treated
    # like a missing one instead of surviving as a NaN weight factor.
    use = use.replace([np.inf, -np.inf], np.nan)
    use["R2_flat"] = use["R2_flat"].fillna(0.0)
    use = use.dropna(subset=["A_theta", "rmse_flat", "RG_mid"])
    if use.empty:
        _write_json({})
        print("no usable claimables")
        return

    out = {}
    rng = np.random.default_rng(42)
    B = 4000
    eps = 1e-8

    for ms, g in use.groupby("Mstar_bin"):
        # json.dump rejects non-builtin key types (e.g. numpy integers), so
        # non-string group labels are stored under their string form.
        key = ms if isinstance(ms, str) else str(ms)

        n = len(g)
        if n < 3:
            out[key] = _nan_entry(n)
            continue

        x = g["RG_mid"].to_numpy(float)
        y = g["A_theta"].to_numpy(float)
        w = _stack_weights(g, eps)

        w_sum = w.sum()
        if not np.isfinite(w_sum) or w_sum <= 0:
            # No valid sampling distribution can be formed from these weights.
            out[key] = _nan_entry(n)
            continue

        mhat = wls_slope(x, y, w)

        # weighted bootstrap: resample stacks with prob proportional to w
        # NOTE(review): the resampled fit is weighted by w again, so the
        # weights enter the bootstrap twice -- confirm this is intended.
        p = w / w_sum
        boots = np.empty(B, float)
        for i in range(B):
            idx = rng.choice(n, size=n, replace=True, p=p)
            boots[i] = wls_slope(x[idx], y[idx], w[idx])
        # Degenerate resamples (all x identical) yield NaN and are discarded.
        boots = boots[np.isfinite(boots)]
        if boots.size:
            lo, hi = np.percentile(boots, [16, 84])
        else:
            lo = hi = float("nan")

        out[key] = {"n_stacks": int(n),
                    "slope_Atheta_vs_RG": float(mhat),
                    "CI_16": float(lo), "CI_84": float(hi),
                    "weighted": True}

    _write_json(out)
    print(f"Wrote {OUT_JSON} (weighted).")

# Run the regression only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
